#   ClassName:  hs300_titles
#   description:
#   爬取东方财富公告大全第一页练习

#   @ author:郭海龙
#   @ Create：2024/3/3 18:20
#   @ Version:1.0
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import json
from selenium.webdriver.chrome.options import Options


# Configure the headless Chrome instance shared by every request in this
# module. `web` is module-level state read by geturl().
opt = Options()
for _flag in ("--headless", "--disable-gpu", "--window-size=400,1600"):
    opt.add_argument(_flag)
web = Chrome(options=opt)


def geturl(url):
    """Navigate the shared headless browser to *url* and parse the result.

    Returns a BeautifulSoup tree built from the rendered page source
    (lxml parser).

    Raises: whatever exception ``web.get`` raised if navigation fails.
    The original code printed the error and then parsed ``page_source``
    anyway, which silently hands the caller a stale or blank page; we
    log and re-raise instead so failures are visible.
    """
    try:
        web.get(url)
    except Exception as e:
        print(f"请求出错！{e}")
        raise
    return BeautifulSoup(web.page_source, 'lxml')


def parse(soup):
    """Print the announcements embedded in the page's inline scripts.

    The Eastmoney notices page ships its data as a JSON array assigned to
    ``var pagedata`` inside an inline ``<script>``. Locate that script,
    slice out the bracketed array, and print title / type / date for each
    announcement. Only the first matching script is processed.

    Parameters:
        soup: parsed page tree — only ``find_all('script')`` and each
              result's ``.text`` attribute are used.
    """
    for script in soup.find_all('script'):
        # Hoist .text: BeautifulSoup rebuilds the string on every access.
        text = script.text
        if 'var pagedata' not in text:
            continue
        # The payload is the outermost [...] inside the assignment.
        start_index = text.find('[')
        end_index = text.rfind(']') + 1
        data = json.loads(text[start_index:end_index])
        print("__________________________________________________________")
        for item in data:
            # .get(...) keeps one malformed entry from aborting the whole
            # run with KeyError/IndexError; fields default to "".
            title = item.get('title', '')
            columns = item.get('columns') or [{}]
            column_name = columns[0].get('column_name', '')
            notice_date = item.get('notice_date', '')
            print(f"公告标题：{title}")
            print(f"公告类型：{column_name}")
            print(f"公告日期：{notice_date}")
        print("_________________________爬取成功！___________________________")
        break



if __name__ == "__main__":
    URL = 'https://data.eastmoney.com/notices/'
    soup = geturl(URL)
    parse(soup)

